{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "initial_id",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-24T03:03:30.153215900Z",
     "start_time": "2023-11-24T03:03:26.954216400Z"
    },
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import os\n",
    "import gensim\n",
    "from gensim.corpora import Dictionary\n",
    "from gensim import corpora\n",
    "import logging\n",
    "import glob\n",
    "import spacy\n",
    "from warnings import filterwarnings\n",
    "from pandas import DataFrame\n",
    "import numpy as np\n",
    "from gensim.models import LdaModel, CoherenceModel, Word2Vec\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Ignore warnings of category DeprecationWarning.\n",
    "filterwarnings('ignore', category=DeprecationWarning)\n",
    "# Ask for root logging at ERROR level so lower-severity records are not shown.\n",
    "logging.basicConfig(level=logging.ERROR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "488cbabbfb48268e",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-11-23T11:20:12.707700700Z",
     "start_time": "2023-11-23T11:20:12.689078300Z"
    },
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "class EnLDA:\n",
    "    \"\"\"LDA topic modelling over the '任职要求' text column of a DataFrame.\n",
    "\n",
    "    The constructor cleans the frame, `participle` turns one text into a list\n",
    "    of lemmas with spaCy, and `run` fits one gensim LdaModel per candidate\n",
    "    topic count and scores each model with c_v coherence.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, data_frame: DataFrame):\n",
    "        \"\"\"Load the spaCy pipeline and store a cleaned copy of `data_frame`.\n",
    "\n",
    "        Args:\n",
    "            data_frame: Frame that must contain the columns '职位' and '任职要求'.\n",
    "        \"\"\"\n",
    "        self.nlp = spacy.load(\"en_core_web_sm\")\n",
    "        # Drop duplicate rows, keeping the last occurrence.\n",
    "        data_frame = data_frame.drop_duplicates(keep='last')\n",
    "        # Drop rows with a missing value in either required column.\n",
    "        data_frame = data_frame.dropna(subset=['职位', '任职要求'])\n",
    "        self.df = data_frame\n",
    "\n",
    "    def participle(self, text):\n",
    "        \"\"\"Tokenise `text` with `self.nlp` and return the lemmas that are kept.\n",
    "\n",
    "        Stop words, punctuation, number-like tokens and the literal token 'I'\n",
    "        are skipped; every other token contributes its `lemma_`.\n",
    "        \"\"\"\n",
    "        return [\n",
    "            word.lemma_\n",
    "            for word in self.nlp(text)\n",
    "            if not word.is_stop and not word.is_punct and not word.like_num and word.text != 'I'\n",
    "        ]\n",
    "\n",
    "    def run(self, start_k=2, end_k=10, random_state=None):\n",
    "        \"\"\"Tokenise the corpus, then train and score one LDA model per topic count.\n",
    "\n",
    "        Args:\n",
    "            start_k: Smallest number of topics to try (inclusive).\n",
    "            end_k: Largest number of topics to try (inclusive).\n",
    "            random_state: Forwarded to LdaModel. Pass an int for repeatable\n",
    "                results; the default None leaves the model unseeded.\n",
    "\n",
    "        Returns:\n",
    "            A list of c_v coherence scores, one per topic count, ordered from\n",
    "            start_k to end_k. The list is empty when start_k > end_k.\n",
    "        \"\"\"\n",
    "        print(\"分词中~\")\n",
    "        # The token lists are deliberately not printed: they can be very large.\n",
    "        texts = self.df['任职要求'].apply(func=self.participle).tolist()\n",
    "\n",
    "        print(\"开始创建语料库~\")\n",
    "        # The dictionary maps every token in the corpus to a unique integer id.\n",
    "        dictionary = Dictionary(texts)\n",
    "        # Bag-of-words representation of each document.\n",
    "        corpus = [dictionary.doc2bow(text) for text in texts]\n",
    "        print(\"LDA主题建模~\")\n",
    "\n",
    "        # The list has to exist before the loop appends to it.\n",
    "        coherence_scores = []\n",
    "        for k in tqdm(range(start_k, end_k + 1)):\n",
    "            # Train an LDA model with k topics.\n",
    "            print(\"训练LDA模型\")\n",
    "            lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=k, random_state=random_state)\n",
    "            # Score the model with c_v coherence.\n",
    "            print(\"计算一致性得分\")\n",
    "            coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')\n",
    "            coherence_score = coherence_model.get_coherence()\n",
    "            coherence_scores.append(coherence_score)\n",
    "            print(f\"主题系数 (K): {k}, 一致性指数评分: {coherence_score}\")\n",
    "        return coherence_scores\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4832830d0cc025f3",
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-11-23T11:20:12.706701400Z"
    },
    "collapsed": false,
    "is_executing": true,
    "jupyter": {
     "outputs_hidden": false
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "当前网站: boss直聘\n",
      "当前网站: CareerBuilder\n",
      "分词中~\n",
      "[['Optum', 'global', 'organization', 'deliver', 'care', 'aid', 'technology', 'help', 'million', 'people', 'live', 'healthy', 'life', 'work', 'team', 'directly', 'improve', 'health', 'outcome', 'connect', 'people', 'care', 'pharmacy', 'benefit', 'datum', 'resource', 'need', 'feel', 'good', 'find', 'culture', 'guide', 'diversity', 'inclusion', 'talented', 'peer', 'comprehensive', 'benefit', 'career', 'development', 'opportunity', 'come', 'impact', 'community', 'serve', 'help', 'advance', 'health', 'equity', 'global', 'scale', 'join', 'start', 'Caring', 'connect', 'grow', 'enjoy', 'flexibility', 'work', 'remotely', 'America', 'tough', 'challenge', 'ask', 'work', 'hybrid', 'set', 'hybrid', 'Work', 'Schedule', 'day', 'week', 'Lansing', 'office', 'day', 'week', 'remote', 'potential', 'work', 'Request', 'Description', 'state', 'give', 'prior', 'position', 'Data', 'Warehouse', 'Business', 'Intelligence', 'Analyst', 'work', 'end', 'user', 'provide', 'information', 'number', 'different', 'way', 'include', 'task', 'run', 'Joint', 'Application', 'Design', 'JAD', 'session', 'understand', 'datum', 'need', 'likely', 'create', 'ad', 'hoc', 'query', 'answer', 'specific', 'user', 'question', 'program', 'design', 'specification', 'create', 'datum', 'warehouse', 'base', 'application', 'include', 'reporting', 'dashboard', 'information', 'analysis', 'delivery', 'system', 'support', 'end', 'user', 'enable', 'create', 'query', 'report', 'need', 'identify', 'data', 'problem', 'work', 'developer', 'correct', 'advanced', 'analytic', 'provide', 'agency', 'predictive', 'explanatory', 'analysis', 'enable', 'well', 'accomplish', 'mission', 'organize', 'run', 'user', 'group', 'encourage', 'self', 'support', 'end', 'user', 'DTMB', 'query', 'BI', 'application', 'developer', 'Data', 'Warehouse', 'Business', 'Intelligence', 'Analyst', 'expect', 'able', 'work', 'agency', 'requester', 'define', 'develop', 'new', 'etl', 'code', 'query', 'provide', 'training', 'technical', 'support', 'DTMB', 
'developer', 'end', 'user', 'task', 'include', 'work', 'end', 'user', 'requester', 'understand', 'datum', 'need', 'create', 'implement', 'solution', 'need', 'develop', 'present', 'training', 'class', 'end', 'user', 'developer', 'help', 'learn', 'effectively', 'use', 'data', 'warehouse', 'platform', 'BI', 'tool', 'available', 'support', 'datum', 'need', 'possible', 'develop', 'present', 'training', 'class', 'DTMB', 'developer', 'well', 'understand', 'use', 'BI', 'development', 'tool', 'create', 'implement', 'reporting', 'solution', 'data', 'warehouse', 'back', 'Teradata', 'provide', 'support', 'developer', 'end', 'user', 'assist', 'accomplish', 'datum', 'reporting', 'need', 'reward', 'recognize', 'performance', 'environment', 'challenge', 'clear', 'direction', 'take', 'succeed', 'role', 'provide', 'development', 'role', 'interested', 'Required', 'Qualifications', 'bachelor', 'degree', 'equivalent', 'technical', 'study', '+', 'year', 'Business', 'Lead', 'Business', 'Intelligence', 'Analyst', 'experience', '+', 'year', 'create', 'Data', 'Warehouse', 'BI', 'application', 'include', 'etl', 'datum', 'mart', 'development', 'reporting', 'dashboard', 'information', 'analysis', 'delivery', 'system', '+', 'year', 'etl', 'SQL', 'development', 'experience', '+', 'year', 'Child', 'Welfare', 'Services', 'Juvenile', 'Justice', 'Juvenile', 'service', 'related', 'experience', 'local', 'Lansing', 'MI', 'ability', 'work', 'day', 'week', 'office', 'Preferred', 'qualification', 'experience', 'State', 'Michigan', 'pmm', 'SUITE', 'agile', 'methodology', 'SQL', 'Assistant', 'Teradata', 'Studio', 'experience', 'end', 'user', 'training', 'experience', 'experience', 'work', 'Data', 'Modelers', 'developer', 'facilitate', 'modification', 'logical', 'design', 'create', 'physical', 'design', 'suit', 'reporting', 'need', 'customer', 'prove', 'ability', 'work', 'independently', 'rely', 'State', 'team', 'member', 'prove', 'communicate', 'effectively', 'verbally', 'writing', 'programmer', 'analyst', 
'immediate', 'supervisor', 'management', 'system', 'user', 'operation', 'State', 'agency', 'private', 'sector', 'prove', 'ability', 'meet', 'user', 'clarification', 'elaboration', 'necessary', 'clearly', 'define', 'problem', 'conceptualize', 'development', 'plan', 'prove', 'ability', 'recognize', 'gather', 'correlate', 'analyze', 'fact', 'draw', 'conclusion', 'define', 'problem', 'devise', 'solution', 'alternative', 'appropriate', 'recommendation', 'prove', 'ability', 'design', 'application', 'take', 'consideration', 'datum', 'reconciliation', 'source', 'system', 'query', 'reporting', 'performance', 'analytical', 'business', 'value', 'career', 'Optum', 'objective', 'health', 'care', 'simple', 'effective', 'hand', 'work', 'aspect', 'health', 'play', 'role', 'create', 'healthy', 'world', 'insight', 'connection', 'person', 'time', 'bring', 'great', 'mind', 'idea', 'health', 'care', 'full', 'potential', 'promote', 'health', 'equity', 'accessibility', 'work', 'diverse', 'engaged', 'high', 'perform', 'team', 'help', 'solve', 'important', 'challenge', 'California', 'Colorado', 'Connecticut', 'Nevada', 'New', 'York', 'Rhode', 'Island', 'Washington', 'resident', 'salary', 'range', 'California', 'Colorado', 'Connecticut', 'Nevada', 'New', 'York', 'Rhode', 'Island', 'Washington', 'resident', 'dollar85000', 'dollar167300', 'pay', 'base', 'factor', 'include', 'limit', 'education', 'work', 'experience', 'certification', 'etc', 'addition', 'salary', 'United', 'Health', 'Group', 'offer', 'benefit', 'comprehensive', 'benefit', 'package', 'incentive', 'recognition', 'program', 'equity', 'stock', 'purchase', '401k', 'contribution', 'benefit', 'subject', 'eligibility', 'requirement', 'matter', 'begin', 'career', 'United', 'Health', 'Group', 'find', 'far', 'reach', 'choice', 'benefit', 'incentive', 'employee', 'work', 'remotely', 'require', 'adhere', 'United', 'Health', 'Group', 'Telecommuter', 'Policy', 'United', 'Health', 'Group', 'mission', 'help', 'people', 'live', 'healthy', 
'life', 'health', 'system', 'work', 'well', 'believe', 'race', 'gender', 'sexuality', 'age', 'location', 'income', 'deserve', 'opportunity', 'live', 'healthy', 'life', 'today', 'far', 'barrier', 'good', 'health', 'disproportionately', 'experience', 'people', 'color', 'historically', 'marginalize', 'group', 'low', 'income', 'committed', 'mitigate', 'impact', 'environment', 'enable', 'deliver', 'equitable', 'care', 'address', 'health', 'disparity', 'improve', 'health', 'outcome', 'enterprise', 'priority', 'reflect', 'mission', 'diversity', 'create', 'healthy', 'atmosphere', 'United', 'Health', 'Group', 'Equal', 'Employment', 'Opportunity', 'Affirmative', 'Action', 'employer', 'qualified', 'applicant', 'receive', 'consideration', 'employment', 'regard', 'race', 'color', 'religion', 'sex', 'age', 'national', 'origin', 'protect', 'veteran', 'status', 'disability', 'status', 'sexual', 'orientation', 'gender', 'identity', 'expression', 'marital', 'status', 'genetic', 'information', 'characteristic', 'protect', 'law', 'United', 'Health', 'Group', 'drug', 'free', 'workplace', 'candidate', 'require', 'pass', 'drug', 'test', 'begin', 'employment', 'recommend', 'Skills', 'Agile', 'Methodology', 'Backend', 'Business', 'Intelligence', 'child', 'Protection', 'communication', 'dashboard'], ['Grammarly', 'excited', 'offer', 'remote', 'hybrid', 'working', 'model', 'team', 'member', 'work', 'primarily', 'remotely', 'America', 'Canada', 'Ukraine', 'Germany', 'Poland', 'Portugal', 'certain', 'role', 'specific', 'location', 'requirement', 'facilitate', 'collaboration', 'particular', 'grammarly', 'hub', 'role', 'person', 'component', 'condition', 'permit', 'team', 'meet', 'week', 'quarter', 'Grammarly', 'hub', 'San', 'Francisco', 'Kyiv', 'New', 'York', 'Vancouver', 'Berlin', 'workspace', 'Krakw', 'flexible', 'approach', 'give', 'team', 'member', 'good', 'world', 'plenty', 'focus', 'time', 'person', 'collaboration', 'foster', 'trust', 'unlock', 'creativity', 'grammarly', 'team', 'member', 
'role', 'base', 'America', 'Canada', 'able', 'collaborate', 'person', 'week', 'quarter', 'travel', 'necessary', 'hub', 'team', 'base', 'opportunity', 'day', 'ten', 'million', 'people', 'professional', 'team', 'rely', 'Grammarly', 'AI', 'enable', 'communication', 'assistance', 'help', 'communicate', 'confidently', 'achieve', 'goal', 'team', 'member', 'autonomy', 'exciting', 'challenge', 'pursuit', 'mission', 'improve', 'life', 'improve', 'communication', 'build', 'decade', 'steady', 'growth', 'profitability', 'define', 'communication', 'assistance', 'category', 'individual', 'enterprise', 'developer', 'tailor', 'service', 'offering', 'Grammarly', 'Premium', 'Grammarly', 'Business', 'Grammarly', 'Education', 'Grammarly', 'Developers', 'begin', 'team', 'collaborate', 'inclusive', 'value', 'drive', 'learning', 'orient', 'environment', 'achieve', 'ambitious', 'goal', 'look', 'Senior', 'Business', 'Intelligence', 'Analyst', 'enable', 'Grammarly', 'well', 'fast', 'data', 'drive', 'business', 'decision', 'making', 'high', 'impact', 'role', 'broad', 'Data', 'team', 'Grammarly', 'focus', 'build', 'generation', 'Business', 'Intelligence', 'stack', 'foster', 'data', 'drive', 'culture', 'entire', 'organization', 'empower', 'decision', 'maker', 'level', 'company', 'single', 'source', 'truth', 'Senior', 'Business', 'Intelligence', 'analyst', 'work', 'closely', 'product', 'marketing', 'business', 'leader', 'Data', 'Engineering', 'team', 'Analytics', 'professional', 'develop', 'scalable', 'Business', 'Intelligence', 'solution', 'company', 'impact', 'Senior', 'Business', 'Intelligence', 'Analyst', 'work', 'product', 'business', 'stakeholder', 'develop', 'scalable', 'Business', 'Intelligence', 'analytic', 'reporting', 'solution', 'Build', 'complex', 'production', 'grade', 'SQL', 'Python', 'pipeline', 'create', 'source', 'truth', 'key', 'Grammarly', 'metric', 'work', 'Data', 'Engineering', 'team', 'provide', 'high', 'quality', 'datum', 'requirement', 'contribute', 'strategy', 
'product', 'team', 'company', 'identify', 'new', 'opportunity', 'use', 'datum', 'function', 'Grammarly', 'look', 'embody', 'eager', 'value', 'ethical', 'adaptable', 'gritty', 'empathetic', 'remarkable', 'able', 'collaborate', 'person', 'week', 'quarter', 'travel', 'necessary', 'hub', 'team', 'base', 'strong', 'datum', 'etl', 'pipeline', 'development', 'datum', 'visualization', 'experience', '+', 'year', 'work', 'experience', 'Business', 'Intelligence', 'Engineer', 'Data', 'Analyst', 'Product', 'Analyst', 'similar', 'role', 'strong', 'analytical', 'critical', 'thinking', 'skill', 'high', 'attention', 'detail', 'strong', 'bias', 'actionable', 'insight', 'passion', 'communicate', 'insight', 'language', 'datum', 'support', 'professionally', 'personally', 'professional', 'growth', 'believe', 'autonomy', 'trust', 'key', 'empower', 'team', 'member', 'good', 'innovative', 'work', 'way', 'align', 'interest', 'talent', 'support', 'professional', 'development', 'advancement', 'training', 'coaching', 'regular', 'feedback', 'connected', 'team', 'Grammarly', 'build', 'product', 'help', 'people', 'connect', 'apply', 'mindset', 'team', 'remote', 'hybrid', 'model', 'enable', 'highly', 'collaborative', 'culture', 'support', 'EAGER', 'ethical', 'adaptable', 'gritty', 'empathetic', 'remarkable', 'value', 'work', 'foster', 'belong', 'team', 'member', 'variety', 'way', 'include', 'employee', 'resource', 'group', 'Grammarly', 'Circles', 'promote', 'connection', 'share', 'identity', 'include', 'BIPOC', 'LGBTQIA', '+', 'team', 'member', 'woman', 'parent', 'celebrate', 'colleague', 'accomplishment', 'global', 'local', 'team', 'specific', 'program', 'compensation', 'Benefits', 'Grammarly', 'offer', 'team', 'member', 'competitive', 'pay', 'benefit', 'package', 'encompass', 'following', 'excellent', 'health', 'care', 'include', 'wide', 'range', 'medical', 'dental', 'vision', 'mental', 'health', 'fertility', 'benefit', 'disability', 'life', 'insurance', 'option', '401k', 'RRSP', 'match', 'pay', 
'parental', 'leave', 'day', 'pay', 'time', 'year', 'day', 'pay', 'holiday', 'year', 'unlimited', 'sick', 'day', 'home', 'office', 'stipend', 'Caregiver', 'pet', 'care', 'stipend', 'Wellness', 'stipend', 'admission', 'discount', 'Learning', 'development', 'opportunity', 'Grammarly', 'take', 'market', 'base', 'approach', 'compensation', 'mean', 'base', 'pay', 'vary', 'depend', 'location', 'America', 'Canada', 'location', 'categorize', 'compensation', 'zone', 'base', 'geographic', 'region', 'cost', 'labor', 'index', 'information', 'compensation', 'zone', 'location', 'currently', 'support', 'employment', 'refer', 'page', 'location', 'interest', 'list', 'speak', 'recruiter', 'additional', 'information', 'base', 'pay', 'vary', 'considerably', 'depend', 'job', 'relate', 'knowledge', 'skill', 'experience', 'expected', 'salary', 'range', 'position', 'outline', 'compensation', 'zone', 'modify', 'future', 'America', 'dollar185000', 'dollar239000', 'year', 'America', 'D', 'dollar166000', 'dollar215000', 'year', 'America', 'D', 'encourage', 'apply', 'Grammarly', 'value', 'difference', 'encourage', 'especially', 'identity', 'traditionally', 'underrepresented', 'tech', 'organization', 'apply', 'discriminate', 'basis', 'race', 'religion', 'color', 'gender', 'expression', 'identity', 'sexual', 'orientation', 'ancestry', 'national', 'origin', 'citizenship', 'age', 'marital', 'status', 'veteran', 'status', 'disability', 'status', 'political', 'belief', 'characteristic', 'protect', 'law', 'Grammarly', 'equal', 'opportunity', 'employer', 'participant', 'America', 'federal', 'E', 'verify', 'program', 'America', 'abide', 'Employment', 'Equity', 'Act', 'Canada', 'note', 'EEOC', 'optional', 'specific', 'America', '-base', 'candidate', 'note', 'Grammarly', 'covid-19', 'vaccination', 'policy', 'require', 'team', 'member', 'North', 'America', 'vaccinate', 'COVID-19', 'meet', 'person', 'Grammarly', 'business', 'work', 'North', 'America', 'hub', 'location', 'qualified', 'candidate', 'North', 
'America', 'vaccinate', 'medical', 'reason', 'sincerely', 'hold', 'religious', 'belief', 'request', 'reasonable', 'accommodation', 'policy', 'Europe', 'team', 'member', 'meet', 'person', 'official', 'Grammarly', 'business', 'work', 'hub', 'location', 'strongly', 'encouraged', 'vaccinate', 'provide', 'proof', 'covid-19', 'vaccination', 'LI', 'Hybrid'], ['Job', 'Requisition', 'ID', 'Position', 'Overconservative', 'Customer', 'Experience', 'Analytics', 'team', 'look', 'passionate', 'driven', 'Data', 'Analyst', 'prove', 'experience', 'drive', 'organizational', 'change', 'rigorous', 'data', 'analysis', 'role', 'perform', 'deep', 'datum', 'mining', 'statistical', 'behavioral', 'analysis', 'Autodesk', 'customer', 'datum', 'partner', 'internal', 'business', 'stakeholder', 'refine', 'key', 'success', 'measure', 'find', 'insight', 'drive', 'program', 'efficiency', 'play', 'significant', 'role', 'shape', 'customer', 'focus', 'data', 'drive', 'culture', 'join', 'dynamic', 'team', 'help', 'transform', 'business', 'decision', 'process', 'actionable', 'customer', 'insight', 'gain', 'meaningful', 'research', 'analysis', 'measurement', 'Autodesk', 'customer', 'experience', 'Job', 'title', 'Lead', 'Business', 'Intelligence', 'Analyst', 'Location', 'San', 'Francisco', 'California', 'ResponsibilitiesWork', 'collaboratively', 'cross', 'functionally', 'define', 'meet', 'stakeholder', 'requirement', 'translate', 'business', 'objective', 'technical', 'datum', 'requirement', 'balance', 'technical', 'feasibility', 'recommend', 'change', 'development', 'maintenance', 'platform', 'standard', 'necessary', 'perform', 'deep', 'dive', 'analysis', 'understand', 'trend', 'anomaly', 'insight', 'drive', 'operational', 'improvement', 'craft', 'datum', 'story', 'presentation', 'write', 'summary', 'data', 'visualization', 'accurately', 'outline', 'problem', 'statement', 'provide', 'actionable', 'unbiased', 'intelligence', 'recommendation', 'communicate', 'finding', 'initiative', 'clarity', 
'accountability', 'broad', 'organization', 'stakeholder', 'clearly', 'document', 'provenance', 'datum', 'etl', 'logic', 'code', 'develop', 'model', 'proactively', 'identify', 'area', 'analytic', 'effort', 'answer', 'business', 'question', 'drive', 'operational', 'improvement', 'business', 'value', 'spearhead', 'development', 'insight', 'drive', 'tool', 'dashboard', 'provide', 'ongoing', 'support', 'functionality', 'data', 'integrity', 'MinimumRequirements', 'Proficiency', 'SQL', 'query', 'large', 'proficiency', 'Google', 'Analytics', 'Adobe', 'Analyticalally', 'experience', 'Microsoft', 'Excel', 'pivot', 'table', 'advanced', 'modeling', 'create', 'chart', 'graph', 'PowerPointExperience', 'BI', 'development', 'database', 'system', 'Power', 'Bi', 'Looker', 'Tableau', 'Experience', 'scripting', 'language', 'r', 'Python', 'equivalent', 'experience', 'Qubole', 'snowflake', 'equivalent', 'Ideal', 'Candidate', 'extensive', 'experience', 'role', 'combine', 'datum', 'analysis', 'business', 'intelligence', 'research', 'strategy', 'strong', 'problem', 'solve', 'skill', 'sharp', 'business', 'judgment', 'datum', 'curious', 'interested', 'tell', 'story', 'datum', 'detail', 'orient', 'ensure', 'data', 'accuracy', 'consistency', 'exceptional', 'communication', 'skill', 'attentive', 'listener', 'compelling', 'influence', 'new', 'idea', 'respectful', 'differ', 'opinion', 'perspective', 'address', 'difficult', 'problem', 'head', 'challenge', 'status', 'quo', 'adapt', 'change', 'open', 'learn', 'new', 'skill', 'self', 'motivate', 'work', 'independently', 'fast', 'pace', 'environment', 'highly', 'collaborative', 'work', 'cross', 'functionally', 'cultivate', 'relationship', 'colleague', 'stakeholder', 'passionate', 'improve', 'customer', 'experience', 'click', 'learn', 'benefit', 'America', 't', 'Autodesk', 'build', 'diverse', 'workplace', 'inclusive', 'culture', 'people', 'chance', 'imagine', 'design', 'well', 'world', 'Autodesk', 'proud', 'equal', 'opportunity', 'employer', 
'consider', 'qualified', 'applicant', 'employment', 'regard', 'race', 'color', 'religion', 'age', 'sex', 'sexual', 'orientation', 'gender', 'gender', 'identity', 'national', 'origin', 'disability', 'veteran', 'status', 'legally', 'protect', 'characteristic', 'consider', 'employment', 'qualified', 'applicant', 'regardless', 'criminal', 'history', 'consistent', 'applicable', 'law', 'exist', 'contractor', 'consultant', 'Autodesk', 'search', 'open', 'job', 'apply', 'internally', 'external', 'site', 'question', 'require', 'support', 'contact', 'Autodesk', 'Careers', 'salary', 'Autodesk', 'competitive', 'package', 'America', '-base', 'role', 'expect', 'starting', 'base', 'salary', 'dollar109500', 'dollar187770', 'offer', 'base', 'candidate', 'experience', 'geographic', 'location', 'exceed', 'range', 'addition', 'base', 'salary', 'significant', 'emphasis', 'annual', 'cash', 'bonus', 'commission', 'sale', 'role', 'stock', 'grant', 'comprehensive', 'benefit', 'package', 'Summary', 'Location', 'San', 'Francisco', 'California', 'America', 'Type'], ['Optum', 'global', 'organization', 'deliver', 'care', 'aid', 'technology', 'help', 'million', 'people', 'live', 'healthy', 'life', 'work', 'team', 'directly', 'improve', 'health', 'outcome', 'connect', 'people', 'care', 'pharmacy', 'benefit', 'datum', 'resource', 'need', 'feel', 'good', 'find', 'culture', 'guide', 'diversity', 'inclusion', 'talented', 'peer', 'comprehensive', 'benefit', 'career', 'development', 'opportunity', 'come', 'impact', 'community', 'serve', 'help', 'advance', 'health', 'equity', 'global', 'scale', 'join', 'start', 'Caring', 'connect', 'grow', 'Business', 'Intelligence', 'Analyst', 'opportunity', 'focused', 'team', 'dedicate', 'help', 'health', 'care', 'system', 'work', 'well', 'position', 'Reporting', 'Data', 'Management', 'team', 'support', 'Rally', 'Health', 'Prevention', 'relate', 'program', 'datum', 'role', 'involve', 'gathering', 'analyze', 'evaluate', 'business', 'requirement', 'reporting', 'data', 
'integration', 'solution', 'request', 'business', 'client', 'involve', 'assess', 'feasibility', 'provide', 'request', 'information', 'base', 'analysis', 'datum', 'datum', 'warehouse', 'source', 'system', 'create', 'detailed', 'functional', 'technical', 'specification', 'Development', 'Quality', 'Assurance', 'team', 'assist', 'datum', 'reporting', 'issue', 'research', 'need', 'primary', 'contributor', 'subject', 'matter', 'expert', 'relate', 'project', 'effort', 'responsible', 'gather', 'effort', 'estimate', 'manage', 'timeline', 'business', 'communication', 'intake', 'deployment', 'enjoy', 'flexibility', 'work', 'remotely', 'America', 'tough', 'challenge', 'primary', 'responsibility', 'collaborate', 'requester', 'gather', 'analyze', 'document', 'business', 'requirement', 'relate', 'reporting', 'data', 'integration', 'solution', 'communicate', 'design', 'stakeholder', 'vary', 'level', 'organization', 'present', 'evaluate', 'design', 'solution', 'objectively', 'facilitate', 'conflict', 'resolution', 'analyze', 'business', 'intelligence', 'requirement', 'Eg', 'standard', 'ad', 'hic', 'reporting', 'dashboard', 'scorecard', 'visualization', 'analytic', 'create', 'functional', 'design', 'base', 'business', 'requirement', 'identify', 'applicable', 'source', 'datum', 'perform', 'datum', 'profiling', 'ensure', 'data', 'availability', 'completeness', 'accuracy', 'consistency', 'assess', 'data', 'management', 'requirement', 'process', 'capability', 'eg', 'governance', 'archiving', 'metadata', 'master', 'datum', 'datum', 'modeling', 'data', 'quality', 'review', 'analyze', 'update', 'project', 'estimate', 'timeline', 'dependency', 'deliverable', 'ensure', 'achievement', 'project', 'goal', 'collaborate', 'Data', 'Architect', 'development', 'team', 'member', 'design', 'document', 'specification', 'datum', 'flow', 'eg', 'data', 'acquisition', 'formatting', 'dependency', 'appropriate', 'datum', 'model', 'eg', 'relational', 'dimensional', 'logical', 'physical', 'new', 'subject', 
'area', 'ensure', 'compliance', 'database', 'datum', 'modeling', 'standard', 'naming', 'convention', 'Eg', 'Enterprise', 'Logical', 'Data', 'Model', 'provide', 'feedback', 'appropriate', 'define', 'document', 'datum', 'configuration', 'datum', 'validation', 'rule', 'transformation', 'logic', 'collaboration', 'business', 'partner', 'conduct', 'require', 'design', 'review', 'update', 'design', 'document', 'appropriate', 'obtain', 'appropriate', 'sign', 'off', 'review', 'troubleshoot', 'potential', 'issue', 'phase', 'project', 'collaborate', 'qa', 'team', 'ensure', 'testing', 'effort', 'align', 'system', 'delivery', 'business', 'process', 'identify', 'scenario', 'require', 'performance', 'testing', 'communicate', 'appropriate', 'testing', 'group', 'communicate', 'ensure', 'understanding', 'change', 'applicable', 'group', 'order', 'facilitate', 'ongoing', 'stability', 'provide', 'ongoing', 'expertise', 'operational', 'support', 'group', 'order', 'address', 'identify', 'problem', 'issue', 'request', 'ensure', 'operational', 'documentation', 'update', 'include', 'system', 'maintenance', 'update', 'change', 'work', 'closely', 'business', 'team', 'strategist', 'solution', 'support', 'long', 'term', 'business', 'objective', 'ensure', 'deliverable', 'align', 'business', 'requirement', 'measurable', 'result', 'act', 'liaison', 'business', 'development', 'quality', 'assurance', 'delivery', 'project', 'management', 'group', 'manage', 'research', 'address', 'incoming', 'datum', 'reporting', 'issue', 'arise', 'reward', 'recognize', 'performance', 'environment', 'challenge', 'clear', 'direction', 'take', 'succeed', 'role', 'provide', 'development', 'role', 'interested', 'Required', 'Qualifications', 'undergraduate', 'degree', 'equivalent', 'work', 'experience', '+', 'year', 'experience', 'gathering', 'define', 'business', 'requirement', '+', 'year', 'experience', 'relational', 'database', 'system', 'database', 'concept', '+', 'year', 'experience', 'work', 'datum', 'datum', 
'analysis', 'SQL', 'Structured', 'Query', 'Language', 'Proficiency', 'MS', 'Office', 'Word', 'Excel', 'Outlook', 'PowerPoint', 'Access', 'demonstrate', 'ability', 'manage', 'multiple', 'project', 'concurrently', 'preferred', 'Qualifications', '+', 'year', 'client', 'facing', 'role', 'internal', 'external', 'client', '+', 'year', 'experience', 'datum', 'warehousing', 'dimensional', 'modeling', 'concept', 'experience', 'data', 'integration', 'concept', 'experience', 'work', 'cloud', 'data', 'platform', 'experience', 'work', 'agile', 'environment', 'familiarity', 'agile', 'methodology', 'prove', 'experience', 'drive', 'change', 'operation', 'prove', 'success', 'change', 'management', 'control', 'experience', 'large', 'enterprise', 'environment', 'health', 'care', 'experience', 'Project', 'Management', 'experience', 'proficient', 'MS', 'Visio', 'demonstrate', 'excellent', 'write', 'verbal', 'communication', 'skill', 'ability', 'communicate', 'diplomatically', 'colleague', 'client', 'functional', 'area', 'prove', 'proactive', 'problem', 'solver', 'critical', 'thinker', 'excellent', 'planning', 'coordination', 'organizational', 'skill', 'career', 'Optum', 'objective', 'health', 'care', 'simple', 'effective', 'hand', 'work', 'aspect', 'health', 'play', 'role', 'create', 'healthy', 'world', 'insight', 'connection', 'person', 'time', 'bring', 'great', 'mind', 'idea', 'health', 'care', 'full', 'potential', 'promote', 'health', 'equity', 'accessibility', 'work', 'diverse', 'engaged', 'high', 'perform', 'team', 'help', 'solve', 'important', 'challenge', 'California', 'Colorado', 'Connecticut', 'Nevada', 'New', 'York', 'Rhode', 'Island', 'Washington', 'resident', 'salary', 'range', 'California', 'Colorado', 'Connecticut', 'Nevada', 'New', 'York', 'Rhode', 'Island', 'Washington', 'resident', 'dollar67800', 'dollar133100', 'Pay', 'base', 'factor', 'include', 'limit', 'education', 'work', 'experience', 'certification', 'etc', 'addition', 'salary', 'United', 'Health', 'Group', 
'offer', 'benefit', 'comprehensive', 'benefit', 'package', 'incentive', 'recognition', 'program', 'equity', 'stock', 'purchase', '401k', 'contribution', 'benefit', 'subject', 'eligibility', 'requirement', 'matter', 'begin', 'career', 'United', 'Health', 'Group', 'find', 'far', 'reach', 'choice', 'benefit', 'incentive', 'employee', 'work', 'remotely', 'require', 'adhere', 'United', 'Health', 'Group', 'Telecommuter', 'Policy', 'United', 'Health', 'Group', 'mission', 'help', 'people', 'live', 'healthy', 'life', 'health', 'system', 'work', 'well', 'believe', 'race', 'gender', 'sexuality', 'age', 'location', 'income', 'deserve', 'opportunity', 'live', 'healthy', 'life', 'today', 'far', 'barrier', 'good', 'health', 'disproportionately', 'experience', 'people', 'color', 'historically', 'marginalize', 'group', 'low', 'income', 'committed', 'mitigate', 'impact', 'environment', 'enable', 'deliver', 'equitable', 'care', 'address', 'health', 'disparity', 'improve', 'health', 'outcome', 'enterprise', 'priority', 'reflect', 'mission', 'diversity', 'create', 'healthy', 'atmosphere', 'United', 'Health', 'Group', 'Equal', 'Employment', 'Opportunity', 'Affirmative', 'Action', 'employer', 'qualified', 'applicant', 'receive', 'consideration', 'employment', 'regard', 'race', 'color', 'religion', 'sex', 'age', 'national', 'origin', 'protect', 'veteran', 'status', 'disability', 'status', 'sexual', 'orientation', 'gender', 'identity', 'expression', 'marital', 'status', 'genetic', 'information', 'characteristic', 'protect', 'law', 'United', 'Health', 'Group', 'drug', 'free', 'workplace', 'candidate', 'require', 'pass', 'drug', 'test', 'begin', 'employment', 'recommend', 'Skills', 'Agile', 'Methodology', 'Business', 'Intelligence', 'Business', 'Planning', 'Business', 'Processes', 'business', 'Requirements', 'Change', 'Management'], ['Job', 'description', 'Asset', 'Mark', 'lead', 'strategic', 'provider', 'innovative', 'investment', 'consulting', 'solution', 'serve', 'independent', 
'financial', 'advisor', 'provide', 'investment', 'relationship', 'practice', 'management', 'solution', 'advisor', 'use', 'help', 'client', 'achieve', 'wealth', 'independence', 'purpose', 'consider', 'candidate', 'position', 'able', 'accommodate', 'hybrid', 'work', 'schedule', 'close', 'follow', 'office', 'Phoenix', 'AZ', 'Concord', 'Californiaor', 'Chicago', 'IL', 'Job', 'Asset', 'Mark', 'seek', 'Business', 'Intelligence', 'Analyst', 'Operations', 'Intelligence', 'team', 'hand', 'role', 'person', 'responsible', 'partnering', 'department', 'business', 'unit', 'implement', 'data', 'solution', 'provide', 'robust', 'insight', 'business', 'include', 'create', 'process', 'receive', 'transfer', 'datum', 'business', 'partner', 'system', 'transform', 'combine', 'datum', 'source', 'design', 'schema', 'datum', 'develop', 'critical', 'metric', 'deliver', 'reporting', 'solution', 'individual', 'capable', 'interface', 'level', 'organization', 'achieve', 'success', 'partner', 'Operations', 'Finance', 'business', 'function', 'develop', 'data', 'solution', 'ensure', 'alignment', 'Asset', 'Mark', 'business', 'strategy', 'overall', 'data', 'strategy', 'analyze', 'business', 'need', 'develop', 'metric', 'design', 'develop', 'data', 'model', 'create', 'reporting', 'dashboard', 'solution', 'metric', 'available', 'business', 'user', 'analyze', 'datum', 'identify', 'implement', 'new', 'datum', 'reporting', 'solution', 'provide', 'meaningful', 'insight', 'improve', 'effectiveness', 'business', 'area', 'design', 'develop', 'process', 'extract', 'datum', 'receive', 'internal', 'external', 'source', 'transform', 'meaningful', 'datum', 'element', 'relevant', 'business', 'combine', 'data', 'element', 'load', 'type', 'database', 'schema', 'manage', 'production', 'support', 'datum', 'relate', 'process', 'ensure', 'business', 'continuity', 'performance', 'integrity', 'report', 'dashboard', 'business', 'visible', 'data', 'solution', 'work', 'collaboratively', 'team', 'internal', 'business', 
'partner', 'external', 'vendor', 'identify', 'understand', 'implement', 'support', 'new', 'business', 'capability', 'business', 'intelligence', 'Knowledge', 'Skills', 'ability', 'strong', 'business', 'data', 'analysis', 'skill', 'able', 'understand', 'complex', 'business', 'process', 'develop', 'meaningful', 'insightful', 'datum', 'rich', 'solution', 'rely', 'infrastructure', 'heavy', 'technology', 'platform', 'possess', 'excellent', 'communication', 'oral', 'written', 'problem', 'solve', 'skill', 'self', 'motivate', 'detail', 'orient', 'able', 'multi', 'task', 'comfortable', 'fast', 'pace', 'demanding', 'dynamic', 'work', 'environment', 'Education', 'experience', 'extensive', 'experience', 'SQL', 'SQL', 'Server', 'r2', 'year', 'relevant', 'business', 'intelligence', 'experience', 'work', 'financial', 'service', 'energy', 'marketing', 'advertising', 'mobile', 'gaming', 'healthcare', 'biotech', 'retail', 'consumer', 'good', 'software', 'hardware', 'related', 'industry', 'year', 'experience', 'manipulate', 'large', 'data', 'set', 'ability', 'analyze', 'datum', 'develop', 'technology', 'solution', 'experience', 'work', 'type', 'etl', 'software', 'directly', 'indirectly', 'relate', 'business', 'intelligence', 'solution', 'proficient', 'visualization', 'reporting', 'solution', 'preferably', 'Power', 'BI', 'experience', 'enterprise', 'data', 'warehouse', 'large', 'datum', 'warehousing', 'tool', 'preferably', 'Azure', 'Compensation', 'Base', 'Salary', 'range', 'position', 'dollar65000', 'dollar75000', 'information', 'reflect', 'base', 'salary', 'range', 'Asset', 'Mark', 'reasonably', 'expect', 'pay', 'position', 'base', 'number', 'factor', 'include', 'job', 'relate', 'knowledge', 'skill', 'education', 'experience', 'actual', 'work', 'location', 'position', 'eligible', 'additional', 'variable', 'incentive', 'compensation', 'competitive', 'benefit', 'candidate', 'legally', 'authorize', 'work', 'America', 'consider', 'unable', 'provide', 'visa', 'sponsorship', 'position', 
'offer', 'Asset', 'Mark', 'mission', 'center', 'help', 'financial', 'advisor', 'difference', 'life', 'client', 'help', 'aim', 'provide', 'advisor', 'holistic', 'support', 'offer', 'compelling', 'technology', 'facilitate', 'well', 'client', 'experience', 'consult', 'service', 'ensure', 'advisor', 'business', 'run', 'good', 'comprehensive', 'suite', 'investment', 'solution', 'Asset', 'Mark', 'platform', 'empower', 'advisor', 'provide', 'high', 'level', 'service', 'possible', 'client', 'Asset', 'Mark', 'culture', 'drive', 'mission', 'connect', 'value', 'Heart', 'Integrity', 'Excellence', 'Respect', 'join', 'team', 'live', 'value', 'day', 'good', 'right', 'encourage', 'different', 'idea', 'continual', 'success', 'innovation', 'additionally', 'offer', 'wide', 'range', 'benefit', 'meet', 'need', 'team', 'member', 'family', 'Flex', 'Time', 'Paid', 'Time', 'Sick', 'Time', 'K', 'percent', 'Employer', 'Match', 'medical', 'Dental', 'Vision', 'HDHP', 'PPO', 'HSA', 'Employer', 'contribution', 'HDHP', 'Volunteer', 'Time', 'Career', 'Development', 'Recognition', 'Fitness', 'Reimbursement', 'hybrid', 'Work', 'schedule', 'Equal', 'Opportunity', 'Employer', 'Asset', 'Mark', 'committed', 'build', 'diverse', 'inclusive', 'workplace', 'feels', 'value', 'Recommended', 'Skills', 'investigation', 'Security', 'Management', 'Law', 'Enforcement', 'Requisition', 'Management', 'diversification', 'Investing', 'Loss', 'Prevention']]\n",
      "开始创建语料库~\n",
      "LDA主题建模~\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/9 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练LDA模型\n"
     ]
    }
   ],
   "source": [
    "# Collect every path in the cleaned-data directory; each file is treated as one site.\n",
    "files = glob.glob(\"../Data/数据清洗/*\")\n",
    "for file in files:\n",
    "    # Site name: last path component (either separator style) up to its first dot.\n",
    "    file_name = file.replace('\\\\', '/').rsplit('/', 1)[-1].partition('.')[0]\n",
    "    print(f\"当前网站: {file_name}\")\n",
    "    df = pd.read_csv(file, index_col=0)\n",
    "    if re.search('[\\u4e00-\\u9fa5]', file_name) is not None:\n",
    "        # Chinese site (name contains CJK characters): nothing is done with it here.\n",
    "        continue\n",
    "    # English site: run the English LDA pipeline once, then stop iterating.\n",
    "    EnLDA(df).run()\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9f2e740-75e6-44d0-ada3-d4dffa3bb655",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "当前网站: boss直聘\n",
      "当前网站: CareerBuilder\n",
      "当前网站: CIA\n",
      "当前网站: DNI\n",
      "当前网站: linkedin\n",
      "当前网站: Simplyhired\n",
      "当前网站: 智联招聘\n",
      "1676 条数据\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 28%|██▊       | 14/50 [02:41<07:14, 12.08s/it]"
     ]
    }
   ],
   "source": [
    "# -*- encoding: utf-8 -*-\n",
    "# @Time       :  15:48\n",
    "# @Author     : yuxian\n",
    "# @Email      : 1503889663@qq.com\n",
    "# @File       : LDA主题建模.py\n",
    "# @SoftWare   : PyCharm en_core_web_sm\n",
    "import glob\n",
    "import re\n",
    "\n",
    "import pandas as pd\n",
    "import spacy\n",
    "import gensim\n",
    "from gensim.models import Word2Vec, LdaModel\n",
    "from gensim.models import CoherenceModel\n",
    "import pyLDAvis.gensim_models\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Load the spaCy 'en_core_web_sm' pipeline used for tokenization and lemmatization;\n",
    "# the 'parser' and 'ner' components are disabled in the load call.\n",
    "nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])\n",
    "\n",
    "\n",
    "# Clean a text and split it into lemmatized tokens\n",
    "def preprocess_text(text):\n",
    "    \"\"\"Return the lemmas of the alphabetic tokens found in ``text``.\n",
    "\n",
    "    The text is passed through the module-level spaCy pipeline ``nlp``.\n",
    "    Tokens whose ``is_alpha`` flag is false are discarded; no stop-word\n",
    "    filtering is applied here.\n",
    "\n",
    "    Args:\n",
    "        text: The raw text handed to ``nlp``.\n",
    "\n",
    "    Returns:\n",
    "        list: ``lemma_`` of every kept token, in document order.\n",
    "    \"\"\"\n",
    "    lemmas = []\n",
    "    for token in nlp(text):\n",
    "        if token.is_alpha:\n",
    "            lemmas.append(token.lemma_)\n",
    "    return lemmas\n",
    "\n",
    "\n",
    "# Skip-gram pair generation with an accompanying \"negative sample\" list\n",
    "def generate_skipgrams(texts, window_size=5, negative_samples=5):\n",
    "    \"\"\"Yield ``(target, context, samples)`` triples for every token of every text.\n",
    "\n",
    "    For each position ``i`` of a text, one triple is produced per other position\n",
    "    ``j`` lying within ``window_size`` of ``i``. The third element is a new list\n",
    "    holding the tokens within ``negative_samples`` positions of ``i`` (``i``\n",
    "    itself excluded) -- i.e. neighbouring tokens, not randomly drawn ones.\n",
    "\n",
    "    Args:\n",
    "        texts: Iterable of indexable token sequences.\n",
    "        window_size: Maximum distance between target and context positions.\n",
    "        negative_samples: Maximum distance used to build the third element.\n",
    "\n",
    "    Yields:\n",
    "        tuple: ``(target_word, context_word, list_of_nearby_tokens)``.\n",
    "    \"\"\"\n",
    "    for text in texts:\n",
    "        length = len(text)\n",
    "        for i, target_word in enumerate(text):\n",
    "            context_start = max(0, i - window_size)\n",
    "            context_stop = min(length, i + window_size + 1)\n",
    "            sample_start = max(0, i - negative_samples)\n",
    "            sample_stop = min(length, i + negative_samples + 1)\n",
    "            for j in range(context_start, context_stop):\n",
    "                if j == i:\n",
    "                    continue\n",
    "                # A fresh list is built for every yielded triple.\n",
    "                nearby = [text[k] for k in range(sample_start, sample_stop) if k != i]\n",
    "                yield target_word, text[j], nearby\n",
    "\n",
    "\n",
    "# Build the Word2Vec model\n",
    "def build_word2vec_model(texts, vector_size=100, window=5, min_count=1, workers=4):\n",
    "    \"\"\"Construct a gensim ``Word2Vec`` model from tokenized texts.\n",
    "\n",
    "    Every argument is forwarded unchanged to the ``Word2Vec`` constructor\n",
    "    (``texts`` as its ``sentences`` argument).\n",
    "\n",
    "    Returns:\n",
    "        The ``Word2Vec`` instance created by the constructor call.\n",
    "    \"\"\"\n",
    "    return Word2Vec(\n",
    "        sentences=texts,\n",
    "        vector_size=vector_size,\n",
    "        window=window,\n",
    "        min_count=min_count,\n",
    "        workers=workers,\n",
    "    )\n",
    "\n",
    "\n",
    "# Build the LDA model\n",
    "def build_lda_model(corpus, dictionary, num_topics, random_state=None):\n",
    "    \"\"\"Construct a gensim ``LdaModel`` for the given corpus.\n",
    "\n",
    "    Args:\n",
    "        corpus: Bag-of-words corpus passed as ``corpus``.\n",
    "        dictionary: Mapping passed as ``id2word``.\n",
    "        num_topics: Number of topics to fit.\n",
    "        random_state: Optional seed forwarded to ``LdaModel`` so that a run can\n",
    "            be reproduced. The default ``None`` keeps the previous unseeded\n",
    "            behaviour.\n",
    "\n",
    "    Returns:\n",
    "        The ``LdaModel`` instance created by the constructor call.\n",
    "    \"\"\"\n",
    "    return LdaModel(\n",
    "        corpus=corpus,\n",
    "        id2word=dictionary,\n",
    "        num_topics=num_topics,\n",
    "        random_state=random_state,\n",
    "    )\n",
    "\n",
    "\n",
    "# Find the best number of topics\n",
    "def find_best_topic_number(corpus, dictionary, texts, max_topics=10):\n",
    "    \"\"\"Fit one LDA model per topic count and return the count with the best coherence.\n",
    "\n",
    "    For every topic count from 1 to ``max_topics`` (inclusive) a model is built\n",
    "    with ``build_lda_model``; its ``c_v`` coherence and its ``log_perplexity``\n",
    "    on ``corpus`` are recorded, and both curves are plotted side by side.\n",
    "\n",
    "    Args:\n",
    "        corpus: Bag-of-words corpus used for training and for ``log_perplexity``.\n",
    "        dictionary: Dictionary passed to the LDA model and to ``CoherenceModel``.\n",
    "        texts: Tokenized documents passed to ``CoherenceModel``.\n",
    "        max_topics: Largest topic count to evaluate; must be at least 1.\n",
    "\n",
    "    Returns:\n",
    "        int: The topic count with the highest coherence score (the smallest\n",
    "        such count when several are tied).\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ``max_topics`` is smaller than 1.\n",
    "    \"\"\"\n",
    "    if max_topics < 1:\n",
    "        # Fail up front instead of later inside max() on an empty score list.\n",
    "        raise ValueError(f'max_topics must be at least 1, got {max_topics}')\n",
    "\n",
    "    topic_counts = range(1, max_topics + 1)\n",
    "    coherence_scores = []\n",
    "    log_perplexity_scores = []\n",
    "\n",
    "    for num_topics in tqdm(topic_counts):\n",
    "        lda_model = build_lda_model(corpus, dictionary, num_topics)\n",
    "\n",
    "        coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')\n",
    "        coherence_scores.append(coherence_model.get_coherence())\n",
    "\n",
    "        # log_perplexity() returns the per-word likelihood bound, not the\n",
    "        # perplexity itself, so the curve is labelled accordingly below.\n",
    "        log_perplexity_scores.append(lda_model.log_perplexity(corpus))\n",
    "\n",
    "    # Plot both curves\n",
    "    fig, (coherence_ax, perplexity_ax) = plt.subplots(1, 2, figsize=(12, 6))\n",
    "\n",
    "    coherence_ax.plot(topic_counts, coherence_scores, marker='o')\n",
    "    coherence_ax.set_title('Coherence Score vs. Number of Topics')\n",
    "    coherence_ax.set_xlabel('Number of Topics')\n",
    "    coherence_ax.set_ylabel('Coherence Score')\n",
    "\n",
    "    perplexity_ax.plot(topic_counts, log_perplexity_scores, marker='o')\n",
    "    perplexity_ax.set_title('Log Perplexity vs. Number of Topics')\n",
    "    perplexity_ax.set_xlabel('Number of Topics')\n",
    "    perplexity_ax.set_ylabel('Log Perplexity (per-word bound)')\n",
    "\n",
    "    fig.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    # Pick the best topic count. A NaN score would make a plain max()\n",
    "    # order-dependent, so NaN is ranked below every real score.\n",
    "    def _rank(index):\n",
    "        score = coherence_scores[index]\n",
    "        return float('-inf') if score != score else score\n",
    "\n",
    "    best_index = max(range(len(coherence_scores)), key=_rank)\n",
    "    return topic_counts[best_index]\n",
    "\n",
    "\n",
    "# Main routine\n",
    "def main(texts):\n",
    "    \"\"\"Run the whole pipeline on raw texts.\n",
    "\n",
    "    Steps: preprocess every text, train a Word2Vec model, build the\n",
    "    dictionary and bag-of-words corpus, search topic counts 1..50 for the best\n",
    "    coherence, fit the final LDA model and hand it to pyLDAvis.\n",
    "\n",
    "    Args:\n",
    "        texts: Iterable of raw documents, each passed to ``preprocess_text``.\n",
    "    \"\"\"\n",
    "    # Tokenize and preprocess every document\n",
    "    tokenized_docs = list(map(preprocess_text, texts))\n",
    "\n",
    "    # Skip-gram generation is currently disabled:\n",
    "    # skipgrams = list(generate_skipgrams(tokenized_docs))\n",
    "\n",
    "    # Train the Word2Vec model.\n",
    "    # NOTE(review): the trained model is not used anywhere later in this function\n",
    "    # (reading its weights via `.wv` was commented out by the author).\n",
    "    build_word2vec_model(tokenized_docs)\n",
    "\n",
    "    # Dictionary and bag-of-words corpus required by LDA\n",
    "    id2word = gensim.corpora.Dictionary(tokenized_docs)\n",
    "    bow_corpus = list(map(id2word.doc2bow, tokenized_docs))\n",
    "\n",
    "    # Search for the best number of topics\n",
    "    topic_count = find_best_topic_number(bow_corpus, id2word, tokenized_docs, max_topics=50)\n",
    "\n",
    "    # Fit the final LDA model with that topic count\n",
    "    final_lda_model = build_lda_model(bow_corpus, id2word, topic_count)\n",
    "\n",
    "    # Visualize the topics\n",
    "    # NOTE(review): pyLDAvis.show() presumably serves the page from a local web\n",
    "    # server and blocks the cell -- confirm whether pyLDAvis.display() is wanted.\n",
    "    pyLDAvis.enable_notebook()\n",
    "    prepared = pyLDAvis.gensim_models.prepare(final_lda_model, bow_corpus, id2word)\n",
    "    pyLDAvis.show(prepared)\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    # Gather the job-requirement texts of every English-language site and\n",
    "    # run the topic-modelling pipeline on the combined list.\n",
    "    files = glob.glob(\"../Data/数据清洗/*\")\n",
    "    En_data = list()\n",
    "    for file in files:\n",
    "        # Site name: last path component (either separator style) up to its first dot.\n",
    "        file_name = file.replace('\\\\', '/').split('/')[-1].split('.')[0]\n",
    "        print(f\"当前网站: {file_name}\")\n",
    "        df = pd.read_csv(file, index_col=0)\n",
    "        if re.search('[\\u4e00-\\u9fa5]', file_name):  # Chinese site: skipped\n",
    "            continue\n",
    "        # English site. A missing requirement text is read by pandas as NaN (a\n",
    "        # float), which the spaCy pipeline cannot process, so drop those rows\n",
    "        # and make sure every remaining entry is a string.\n",
    "        En_data += df[\"任职要求\"].dropna().astype(str).tolist()\n",
    "    print(len(En_data), \"条数据\")\n",
    "    main(En_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00d2bf71-4877-4fd9-8c1c-960f0c98096b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
